{
library(tidyverse)
library(lubridate)
library(plotly)
library(caret)
library(randomForest)
library(h2o)
library(highcharter)
library(hrbrthemes)
library(viridis)
library(purrr)
library(gridExtra)
library(formatR)
library(tidytext)
library(RColorBrewer)
library(DT)  
library(Amelia)
library(corrplot) 
library(missRanger)
}

Import the train and test dataset

# Read the train and test CSVs; treat empty strings, blanks and "NA" as
# missing values. Use TRUE rather than the reassignable shorthand T.
train <- read.csv("Train.csv", header = TRUE, na.strings = c("", " ", "NA"))
test <- read.csv("Test.csv", header = TRUE, na.strings = c("", " ", "NA"))

Structure the Train data

str(train)
## 'data.frame':    23856 obs. of  18 variables:
##  $ INCIDENT_ID     : chr  "CR_102659" "CR_189752" "CR_184637" "CR_139071" ...
##  $ DATE            : chr  "04-JUL-04" "18-JUL-17" "15-MAR-17" "13-FEB-09" ...
##  $ X_1             : int  0 1 0 0 0 0 0 0 0 1 ...
##  $ X_2             : int  36 37 3 33 33 45 30 8 49 4 ...
##  $ X_3             : int  34 37 2 32 32 45 35 7 49 4 ...
##  $ X_4             : int  2 0 3 2 2 10 7 7 6 6 ...
##  $ X_5             : int  1 0 5 1 1 3 3 3 5 5 ...
##  $ X_6             : int  5 11 1 7 8 1 7 9 8 15 ...
##  $ X_7             : int  6 17 0 1 3 0 1 8 3 10 ...
##  $ X_8             : int  1 1 2 1 0 1 0 0 1 0 ...
##  $ X_9             : int  6 6 3 6 5 6 5 5 1 5 ...
##  $ X_10            : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ X_11            : int  174 236 174 249 174 303 174 316 316 145 ...
##  $ X_12            : num  1 1 1 1 0 1 0 1 1 1 ...
##  $ X_13            : int  92 103 110 72 112 72 112 72 103 103 ...
##  $ X_14            : int  29 142 93 29 29 62 29 62 14 29 ...
##  $ X_15            : int  36 34 34 34 43 34 43 34 34 34 ...
##  $ MULTIPLE_OFFENSE: int  0 1 1 1 1 1 1 1 1 0 ...

Summary of Train data

summary(train)
##  INCIDENT_ID            DATE                X_1              X_2       
##  Length:23856       Length:23856       Min.   :0.0000   Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:0.0000   1st Qu.: 7.00  
##  Mode  :character   Mode  :character   Median :0.0000   Median :24.00  
##                                        Mean   :0.4838   Mean   :24.79  
##                                        3rd Qu.:0.0000   3rd Qu.:36.00  
##                                        Max.   :7.0000   Max.   :52.00  
##                                                                        
##       X_3             X_4              X_5             X_6        
##  Min.   : 0.00   Min.   : 0.000   Min.   :0.000   Min.   : 1.000  
##  1st Qu.: 8.00   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.: 3.000  
##  Median :24.00   Median : 4.000   Median :3.000   Median : 5.000  
##  Mean   :24.64   Mean   : 4.277   Mean   :2.456   Mean   : 6.154  
##  3rd Qu.:35.00   3rd Qu.: 6.000   3rd Qu.:5.000   3rd Qu.: 8.000  
##  Max.   :52.00   Max.   :10.000   Max.   :5.000   Max.   :19.000  
##                                                                   
##       X_7              X_8               X_9             X_10       
##  Min.   : 0.000   Min.   : 0.0000   Min.   :0.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 0.0000   1st Qu.:5.000   1st Qu.: 1.000  
##  Median : 4.000   Median : 1.0000   Median :5.000   Median : 1.000  
##  Mean   : 4.877   Mean   : 0.9725   Mean   :4.924   Mean   : 1.245  
##  3rd Qu.: 7.000   3rd Qu.: 1.0000   3rd Qu.:6.000   3rd Qu.: 1.000  
##  Max.   :18.000   Max.   :99.0000   Max.   :6.000   Max.   :90.000  
##                                                                     
##       X_11          X_12              X_13             X_14       
##  Min.   :  0   Min.   : 0.0000   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:174   1st Qu.: 1.0000   1st Qu.: 72.00   1st Qu.: 29.00  
##  Median :249   Median : 1.0000   Median : 98.00   Median : 62.00  
##  Mean   :207   Mean   : 0.9741   Mean   : 85.24   Mean   : 72.67  
##  3rd Qu.:249   3rd Qu.: 1.0000   3rd Qu.:103.00   3rd Qu.:107.00  
##  Max.   :332   Max.   :90.0000   Max.   :116.00   Max.   :142.00  
##                NA's   :182                                        
##       X_15       MULTIPLE_OFFENSE
##  Min.   : 0.00   Min.   :0.0000  
##  1st Qu.:34.00   1st Qu.:1.0000  
##  Median :34.00   Median :1.0000  
##  Mean   :33.46   Mean   :0.9552  
##  3rd Qu.:34.00   3rd Qu.:1.0000  
##  Max.   :50.00   Max.   :1.0000  
## 

Structure of Test data

str(test)
## 'data.frame':    15903 obs. of  17 variables:
##  $ INCIDENT_ID: chr  "CR_195453" "CR_103520" "CR_196089" "CR_112195" ...
##  $ DATE       : chr  "01-FEB-18" "05-MAR-04" "27-JAN-18" "18-AUG-06" ...
##  $ X_1        : int  0 0 0 7 0 0 0 0 0 0 ...
##  $ X_2        : int  30 44 34 3 7 47 33 21 36 16 ...
##  $ X_3        : int  35 44 33 2 8 48 32 23 34 15 ...
##  $ X_4        : int  7 1 3 3 7 7 2 4 2 0 ...
##  $ X_5        : int  3 3 5 5 3 3 1 1 1 0 ...
##  $ X_6        : int  6 7 2 9 2 4 6 5 1 1 ...
##  $ X_7        : int  4 1 7 8 7 2 4 6 0 0 ...
##  $ X_8        : int  0 4 3 0 1 1 0 0 0 3 ...
##  $ X_9        : int  5 6 0 5 5 6 5 5 5 6 ...
##  $ X_10       : int  1 1 1 1 1 1 1 1 1 7 ...
##  $ X_11       : int  174 316 316 174 174 0 174 249 174 316 ...
##  $ X_12       : num  NA 0 1 1 0 0 0 1 0 7 ...
##  $ X_13       : int  72 12 72 112 112 34 103 92 92 72 ...
##  $ X_14       : int  119 29 0 87 93 29 103 93 93 29 ...
##  $ X_15       : int  23 34 34 34 43 34 43 34 48 34 ...

Summary of Test data

summary(test)
##  INCIDENT_ID            DATE                X_1              X_2       
##  Length:15903       Length:15903       Min.   :0.0000   Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:0.0000   1st Qu.: 7.00  
##  Mode  :character   Mode  :character   Median :0.0000   Median :24.00  
##                                        Mean   :0.4681   Mean   :24.72  
##                                        3rd Qu.:0.0000   3rd Qu.:36.00  
##                                        Max.   :7.0000   Max.   :52.00  
##                                                                        
##       X_3             X_4              X_5             X_6        
##  Min.   : 0.00   Min.   : 0.000   Min.   :0.000   Min.   : 1.000  
##  1st Qu.: 8.00   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.: 3.000  
##  Median :24.00   Median : 4.000   Median :3.000   Median : 5.000  
##  Mean   :24.58   Mean   : 4.284   Mean   :2.448   Mean   : 6.085  
##  3rd Qu.:35.00   3rd Qu.: 6.000   3rd Qu.:5.000   3rd Qu.: 8.000  
##  Max.   :52.00   Max.   :10.000   Max.   :5.000   Max.   :19.000  
##                                                                   
##       X_7              X_8               X_9             X_10       
##  Min.   : 0.000   Min.   : 0.0000   Min.   :0.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 0.0000   1st Qu.:5.000   1st Qu.: 1.000  
##  Median : 4.000   Median : 1.0000   Median :5.000   Median : 1.000  
##  Mean   : 4.863   Mean   : 0.9867   Mean   :4.909   Mean   : 1.241  
##  3rd Qu.: 7.000   3rd Qu.: 1.0000   3rd Qu.:6.000   3rd Qu.: 1.000  
##  Max.   :18.000   Max.   :50.0000   Max.   :6.000   Max.   :40.000  
##                                                                     
##       X_11          X_12              X_13             X_14       
##  Min.   :  0   Min.   : 0.0000   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:174   1st Qu.: 1.0000   1st Qu.: 72.00   1st Qu.: 29.00  
##  Median :249   Median : 1.0000   Median : 98.00   Median : 62.00  
##  Mean   :207   Mean   : 0.9722   Mean   : 85.19   Mean   : 72.22  
##  3rd Qu.:249   3rd Qu.: 1.0000   3rd Qu.:103.00   3rd Qu.:107.00  
##  Max.   :332   Max.   :40.0000   Max.   :117.00   Max.   :142.00  
##                NA's   :127                                        
##       X_15      
##  Min.   : 0.00  
##  1st Qu.:34.00  
##  Median :34.00  
##  Mean   :33.42  
##  3rd Qu.:34.00  
##  Max.   :50.00  
## 

To make the exploratory data analysis more informative, recode the target variable (MULTIPLE_OFFENSE) levels: "1" as Yes and "0" as No.

train$MULTIPLE_OFFENSE <- ifelse(train$MULTIPLE_OFFENSE == 1, "Yes", "No")

The following pie chart shows the class balance of the target variable MULTIPLE_OFFENSE (single vs multiple offenses).

# Pie chart of the target class balance (Yes vs No).
train %>%
  count(MULTIPLE_OFFENSE, name = "counts") %>%
  plot_ly(
    labels = ~MULTIPLE_OFFENSE,
    values = ~counts,
    type = "pie"
  ) %>%
  layout(legend = list(
    orientation = "h",
    xanchor = "center",
    x = 0.5
  ))
## `summarise()` ungrouping output (override with `.groups` argument)

Let's visualize the train & test data columns X_1 to X_15 and see their distributions.

Train data

# Distribution of each anonymised feature X_1..X_15 in the train data.
ggplotly(
  train %>% select(-MULTIPLE_OFFENSE) %>%
  # pivot_longer() supersedes gather(); fct_inorder() keeps the facets in
  # column order X_1..X_15 (the old factor_key = TRUE behaviour)
  pivot_longer(X_1:X_15, names_to = "measurement", values_to = "value") %>%
  mutate(measurement = fct_inorder(measurement)) %>%
  ggplot(aes(x = value, fill = measurement)) + geom_histogram()  +
  facet_wrap(~ measurement , scales = "free" , ncol = 3) + theme_light() +
  theme(
  legend.position = "none",
  panel.spacing = unit(2, "lines"),
  axis.title.x = element_blank(),
  axis.title.y = element_blank()
  )
  )

Test data

# Distribution of each anonymised feature X_1..X_15 in the test data.
ggplotly(
  test %>%
  # pivot_longer() supersedes gather(); fct_inorder() keeps the facets in
  # column order X_1..X_15 (the old factor_key = TRUE behaviour)
  pivot_longer(X_1:X_15, names_to = "measurement", values_to = "value") %>%
  mutate(measurement = fct_inorder(measurement)) %>%
  ggplot(aes(x = value, fill = measurement)) + geom_histogram()  +
  facet_wrap(~ measurement, scales = "free", ncol = 3) + theme_light() +
  theme(
  legend.position = "none",
  panel.spacing = unit(2, "lines"),
  axis.title.x = element_blank(),
  axis.title.y = element_blank()
  )
  )


The train and test data both have a date column, so we perform some feature engineering to extract valuable information from it. Before that, combine the train and test data and store the result in a new data frame.

fulldata <- merge(train, test, all = T)

The format of date in dataset is character so convert it to date type

# Parse "04-JUL-04"-style day-month-year strings into Date objects.
# (The former extra data.frame() re-coercion was a no-op and is removed.)
fulldata$DATE <- dmy(fulldata$DATE)

Next extract the features like day, month, year, week ,weekday,quarter and leap year from date column

# Derive calendar features from DATE in a single pipeline and order the
# columns by name: INCIDENT_ID, DATE, date_count, date_day, date_month,
# date_year, date_quarter, date_week, date_wday, date_leap_year,
# X_1..X_15, MULTIPLE_OFFENSE — the same final layout the step-by-step
# positional select() calls produced, but without the fragile indices.
fulldata <- fulldata %>%
  group_by(DATE) %>%
  mutate(date_count = n()) %>%  # number of incidents recorded on each date
  ungroup() %>%                 # drop the DATE grouping so it cannot leak downstream
  mutate(
    date_day       = lubridate::day(DATE),
    date_month     = lubridate::month(DATE, label = TRUE),
    date_year      = lubridate::year(DATE),
    date_quarter   = lubridate::quarter(DATE),
    date_week      = lubridate::week(DATE),
    date_wday      = lubridate::wday(DATE, label = TRUE),
    date_leap_year = lubridate::leap_year(DATE)
  ) %>%
  select(INCIDENT_ID, DATE, date_count, date_day, date_month, date_year,
         date_quarter, date_week, date_wday, date_leap_year, everything())
datatable(head(fulldata[,c(2:10)]), class = 'cell-border stripe')

The following time series chart explains how the number of hack attacks on digital payments changed over the time.

# Yearly attack counts (labelled rows only, i.e. the train portion of
# fulldata), drawn as one interactive line per target class.
yearly_counts <- fulldata %>%
  filter(!is.na(MULTIPLE_OFFENSE)) %>%
  count(date_year, MULTIPLE_OFFENSE, name = "counts")

highchart() %>%
  hc_add_series(
    data = yearly_counts,
    type = "line",
    hcaes(x = date_year, y = counts, group = MULTIPLE_OFFENSE)
  )

Let's break down the time series plot into various levels and see how the number of hack attacks on digital payments changed across weeks, months, quarters and leap years from 1991 to 2018.

The following visualization shows the number of hack attacks in each month over the years.

# Attacks per month, faceted by year; bars reordered within each facet by
# descending count, with a custom hover tooltip.
ggplotly(
  fulldata %>% drop_na(MULTIPLE_OFFENSE) %>%
  filter(MULTIPLE_OFFENSE == 'Yes') %>%
  group_by(date_month, date_year, MULTIPLE_OFFENSE) %>%
  summarise(counts = n(), .groups = "drop") %>%
  ggplot(
  aes(
  x = reorder_within(date_month,-counts,
  date_year),
  y = counts,
  fill = date_month,
  text = paste("Month:",
  date_month,
  "<br> counts:",
  counts)
  )
  ) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = "identity")
  geom_col(position = "stack") + scale_fill_brewer(palette = "Set3") + xlab("Month") +
  facet_wrap( ~ date_year , scales = "free" , ncol = 3) +
  scale_x_reordered() +
  theme(
  panel.spacing = unit(1, "lines"),
  axis.text.x = element_blank(),
  axis.title.x = element_text(face = "bold", hjust = 0.5),
  axis.title.y = element_blank()
  ) ,
  tooltip = "text"
  ) %>%
  layout(legend = list(
  orientation = "h",
  x = 0,
  y = -0.05
  ))

The following visualization shows the number of hack attacks in each quarter over the years. More hack attacks occurred in the 2nd and 3rd quarters.

# Attacks per quarter, faceted by year; bars reordered within each facet by
# descending count, with a custom hover tooltip.
ggplotly(
  fulldata %>% drop_na(MULTIPLE_OFFENSE) %>%
  filter(MULTIPLE_OFFENSE == 'Yes') %>%
  dplyr::group_by(date_quarter, date_year, MULTIPLE_OFFENSE) %>%
  summarise(counts = n(), .groups = "drop") %>%
  ggplot(aes(
  x = reorder_within(date_quarter,-counts,
  date_year),
  y = counts,
  fill = factor(date_quarter),
  text = paste("Quarter:",
  date_quarter,
  "<br> counts:",
  counts)
  )) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = "identity")
  geom_col(position = "stack") + scale_fill_brewer(palette = "Accent") + xlab("Quarter") +
  facet_wrap( ~ date_year , scales = "free" , ncol = 3) +
  scale_x_reordered() +
  theme(
  panel.spacing = unit(1, "lines"),
  # was `fac = "bold"`, which only worked via partial argument matching
  axis.text.x = element_text(face = "bold"),
  axis.title.x = element_text(face = "bold", hjust = 0.5),
  axis.title.y = element_blank()
  ),
  tooltip = "text"
  )  %>%
  layout(legend = list(
  orientation = "h",
  x = 0.3,
  y = -0.05
  ))

The following plot shows the number of hack attacks on each day of the week over the years; most hack attacks occurred on weekdays.

# Attacks per day of the week, faceted by year; bars reordered within each
# facet by descending count, with a custom hover tooltip.
ggplotly(
  fulldata %>% drop_na(MULTIPLE_OFFENSE) %>%
  filter(MULTIPLE_OFFENSE == 'Yes') %>%
  group_by(date_wday, date_year, MULTIPLE_OFFENSE) %>%
  summarise(counts = n(), .groups = "drop") %>%
  ggplot(aes(
  x = reorder_within(date_wday,-counts,
  date_year),
  y = counts,
  fill = date_wday,
  text = paste("Week Day:",
  date_wday,
  "<br> counts:",
  counts)
  )) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = "identity")
  geom_col(position = "stack") + scale_fill_brewer(palette = "Set2") + xlab("Week Day") +
  facet_wrap( ~ date_year , scales = "free" , ncol = 3) +
  scale_x_reordered() +
  theme(
  panel.spacing = unit(1, "lines"),
  axis.text.x = element_blank(),
  axis.title.x = element_text(face = "bold", hjust = 0.5),
  axis.title.y = element_blank()
  ),
  tooltip = "text"
  ) %>%
  layout(legend = list(
  orientation = "h",
  x = 0.2,
  y = -0.05
  ))

Ordinary Year - Leap Year hack attacks

# Yearly attack counts with points/labels coloured by leap-year status and a
# single connecting line across all years.
fulldata %>% drop_na(MULTIPLE_OFFENSE) %>%
  filter(MULTIPLE_OFFENSE == 'Yes') %>%
  group_by(date_leap_year, date_year) %>%
  summarise(counts = n(), .groups = "drop") %>%
  ggplot(aes(
  x = date_year,
  y = counts,
  label = date_year,
  color = date_leap_year
  )) +
  geom_point() +
  geom_text(aes(color = factor(date_leap_year))) +
  # x, y and colour are inherited from the plot-level aes(); group = 1 joins
  # leap and non-leap points into one connected line
  geom_line(aes(group = 1), size = 1) +
  xlab("Year") +
  scale_x_continuous(breaks = seq(1991, 2018, by = 1)) +
  labs(color = "Leap Year\n") +
  theme(
  legend.position = "bottom",
  axis.text.x = element_blank(),
  axis.title.y = element_blank()
  )

The X_1 to X_15 features are anonymised, so we cannot tell which are quantitative and which are qualitative. Let's assume the features X_1 to X_15 are continuous and examine their correlations.

Train data

# Correlation matrix of the numeric train columns. Named cor_train rather
# than `cor`, which would mask base::cor().
cor_train <- cor(train %>% na.omit() %>% select_if(is.numeric))
corrplot(cor_train, method = "circle")

Test data

# Correlation matrix of the numeric test columns; descriptive name instead
# of the opaque `cor1`.
cor_test <- cor(test %>% na.omit() %>% select_if(is.numeric))
corrplot(cor_test, method = "circle")

In the above correlation plot, blue represents positive correlation and red represents negative correlation; the colour intensity indicates the strength of the correlation, ranging from -1 to +1 (0 means no correlation, +1 a perfect positive correlation and -1 a perfect negative correlation).
There is a strong positive correlation between X_2 and X_3, and between X_10 and X_12.

Let's visualize the missing values in the train and test data.


Train data

missmap(train[,-c(1,2,18)])

Test data

missmap(test[,-c(1,2)])


Using missRanger package to impute missing values.

# Random-forest imputation of the only incomplete column (X_12); drop
# INCIDENT_ID, DATE and MULTIPLE_OFFENSE so identifiers and the target are
# not used as imputation predictors. Fixed seed keeps the result repeatable.
fulldataimp <- missRanger(fulldata[,-c(1,2,26)], num.trees = 100, verbose = TRUE,
                          seed = 887, pmm.k = 4)
## 
## Missing value imputation by random forests
## 
##   Variables to impute:       X_12
##   Variables used to impute:  date_count, date_day, date_month, date_year, date_quarter, date_week, date_wday, date_leap_year, X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8, X_9, X_10, X_11, X_12, X_13, X_14, X_15
## iter 1:  .
## iter 2:  .
## iter 3:  .

insert incident id to new data frame

# Re-attach the incident ids that were dropped before imputation (missRanger
# preserves row order, so positional assignment is safe).
fulldataimp$INCIDENT_ID <- fulldata$INCIDENT_ID

# Move INCIDENT_ID to the front by name instead of the fragile position 24.
fulldataimp <- fulldataimp %>% select(INCIDENT_ID, everything())

insert MULTIPLE_OFFENSE to new data frame

fulldataimp$MULTIPLE_OFFENSE<-fulldata$MULTIPLE_OFFENSE

Lets recode the MULTIPLE_OFFENSE by 1 and 0

fulldataimp$MULTIPLE_OFFENSE<-as.factor(ifelse(fulldataimp$MULTIPLE_OFFENSE=='Yes',1,0))

lets split the data into Train set, validation set, Test set

Train set

trainn<-fulldataimp[complete.cases(fulldataimp$MULTIPLE_OFFENSE),]

Test set

testn<-fulldataimp[!complete.cases(fulldataimp$MULTIPLE_OFFENSE),]

Create a stratified sample train and validation set data from trainn dataframe

# Stratified 75/25 split of the labelled rows into training and validation
# sets. NOTE(review): there is no set.seed() before this call, so the split
# is not reproducible across sessions — consider pinning one.
index <- createDataPartition(y = trainn$MULTIPLE_OFFENSE, p = 0.75, list = FALSE)

trains <- trainn[index, ]

vals <- trainn[-index, ]

Classify hack attacks using the Random Forest algorithm. The features X_3 and X_12 are removed because they are highly correlated with X_2 and X_10 respectively.

# Random forest on the anonymised features only: X_3 and X_12 are dropped
# (each strongly correlated with X_2 and X_10 respectively), and the
# engineered date columns and INCIDENT_ID are excluded. Columns are selected
# by name instead of the fragile positional indices c(10,11,13:20,22:25),
# which denoted exactly this set.
rfmod <- randomForest(
  factor(MULTIPLE_OFFENSE) ~ .,
  data = trains %>% select(X_1, X_2, X_4:X_11, X_13:X_15, MULTIPLE_OFFENSE),
  strata = MULTIPLE_OFFENSE,
  importance = TRUE, ntree = 500, mtry = 4, replace = FALSE
)

Evaluate the model performance by using Validation dataset

valpre<-predict(rfmod,vals)

create a confusion matrix and see how its predicted the actual classes

cm<-confusionMatrix(valpre, factor(vals$MULTIPLE_OFFENSE),mode="everything") 

Validation set F1 score

cm$byClass[7] 
##        F1 
## 0.9608939